I chose to work on Case 3, but had to complete Case 2 beforehand. Unfortunately, there was not enough time to compute all of the features for Case 3. Therefore, after applying a Lasso regression I also tested the case with a random forest regressor, which performed better in my case (probably because I have not prepared the data exactly as in the articles).
In that sense, regarding the task question "How can the results of the article be expanded/developed/upgraded?", I would suggest implementing a random forest or a decision tree.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os import walk, getcwd
In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later. In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later. In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead. In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later. In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: The savefig.jpeg_quality rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later. In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: The keymap.all_axes rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later. In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: The animation.avconv_path rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later. In C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: The animation.avconv_args rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
# Names of the files sitting directly in the current working directory
# (only the first walk() tuple is needed: top level, no recursion).
_, _, top_level_files = next(walk(getcwd()))
f = list(top_level_files)
import pyreadstat
def read_spss(module, f=f):
    """Read the single SPSS (.sav) file whose name contains *module*.

    Parameters
    ----------
    module : str
        Substring identifying the module file (e.g. 'ac', 'wq').
    f : list of str, optional
        Candidate file names; defaults to the directory listing captured above
        (bound once at definition time, deliberately).

    Returns
    -------
    The (DataFrame, metadata) pair produced by ``pyreadstat.read_sav``, with
    SPSS value labels applied as unordered categoricals.

    Raises
    ------
    ValueError
        If zero or more than one candidate file matches *module*.  (The
        original message claimed 'More than one file!' even when no file
        matched at all.)
    """
    matches = [name for name in f if module in name]
    if len(matches) != 1:
        # Distinguish "nothing found" from genuine ambiguity in the message.
        raise ValueError(
            f"Expected exactly one file matching {module!r}, found {len(matches)}"
        )
    return pyreadstat.read_sav(matches[0], apply_value_formats=True,
                               formats_as_category=True,
                               formats_as_ordered_category=False)
# Load the SPSS survey modules used below; each call returns the data frame
# plus its metadata (variable labels, value formats).  Unused modules are
# left commented out to keep memory down.
ac, ac_meta = read_spss('ac')
#mh, mh_meta = read_spss('mh')
ph, ph_meta = read_spss('ph')
#dn, dn_meta = read_spss('dn')
ep, ep_meta = read_spss('ep')
#co, co_meta = read_spss('co')
gv_imputations, gv_imputations_meta = read_spss('gv_imputations')
def get_desc_df(df, meta):
    """Build a per-variable summary table for an SPSS dataset.

    For every column it reports the variable name, its SPSS label, the
    storage type, the number of distinct values, and the missing count.
    """
    summary = {
        "var": meta.column_names,
        "varlab": meta.column_labels,
        "values": list(meta.readstat_variable_types.values()),
        "uniques": [len(set(df[column])) for column in df.columns],
        "nas": df.isna().sum().tolist(),
    }
    return pd.DataFrame(summary)
# Per-variable summaries of the two frames used most below.
ac_desc = get_desc_df(ac, ac_meta)
gov_dsec = get_desc_df(gv_imputations, gv_imputations_meta)
# Drop identifier/technical columns and the ac036_/ac037_/ac038_/ac040 item blocks.
cols_to_use = [i for i in ac if (('ac036_' not in i) and ('ac037_' not in i) and ('ac038_' not in i) and ('ac040' not in i)
and ('hhid7' not in i) and ('mergeidp7' not in i) and ('coupleid7' not in i) and ('language' not in i))]
ac = ac[cols_to_use]
# Missing-value counts for the Romanian subsample (displayed below).
ac[ac['country'] == 'Romania'].isna().sum()
mergeid 0 country 0 ac012_ 5 ac014_ 5 ac015_ 5 ac016_ 5 ac017_ 5 ac018_ 5 ac019_ 5 ac020_ 5 ac021_ 2114 ac022_ 5 ac023_ 5 ac024_ 5 ac025_ 5 ac035d1 43 ac035d4 43 ac035d5 43 ac035d7 43 ac035d8 43 ac035d9 43 ac035d10 43 ac035dno 43 ac701_ 5 ac702_ 5 ac703_ 5 ac704_ 5 ac705_ 5 ac706_ 5 ac707_ 5 ac708_ 5 ac709_ 5 ac710_ 5 ac711_ 5 ac740_ 5 dtype: int64
# 'Interpolated' must be registered as a category before it can be assigned.
# NOTE: cat.add_categories(..., inplace=True) was deprecated in pandas 1.3 and
# removed in pandas 2.0 — assign the result back instead.
ac['ac021_'] = ac['ac021_'].cat.add_categories(['Interpolated'])
# Treat interviewer non-answers as missing values across every column.
ac = ac.apply(
    lambda col: col.apply(
        lambda val: np.nan if val in ("Don't know", "Refusal") else val
    )
)
# Romania has no usable ac021_ answers (2114 missing above); flag them with the
# placeholder instead of losing all Romanian rows to dropna().
ac.loc[ac['country'] == 'Romania', ['ac021_']] = 'Interpolated'
# Re-check the Romanian missing counts after cleaning (displayed below).
ac[ac['country'] == 'Romania'].isna().sum()
mergeid 0 country 0 ac012_ 14 ac014_ 7 ac015_ 8 ac016_ 8 ac017_ 7 ac018_ 8 ac019_ 7 ac020_ 7 ac021_ 0 ac022_ 7 ac023_ 7 ac024_ 7 ac025_ 8 ac035d1 44 ac035d4 44 ac035d5 44 ac035d7 44 ac035d8 44 ac035d9 44 ac035d10 44 ac035dno 44 ac701_ 7 ac702_ 7 ac703_ 7 ac704_ 7 ac705_ 7 ac706_ 7 ac707_ 7 ac708_ 8 ac709_ 7 ac710_ 8 ac711_ 7 ac740_ 5 dtype: int64
# Drop any respondent with at least one missing answer and renumber the rows.
ac = ac.dropna().reset_index(drop=True)

# Recode the answer labels to numeric scores in ONE pass per column instead of
# the original eleven chained element-wise .apply passes.  The three scales
# (selected/not, frequency, agreement) share no labels, so a single lookup
# table is safe; unmapped values pass through unchanged, as before.
_LIKERT_SCORES = {
    'Not selected': 1, 'Selected': 5,
    'Never': 1, 'Rarely': 2, 'Sometimes': 3, 'Often': 4,
    'Disagree strongly': 1, 'Disagree a little': 2,
    'Neither agree nor disagree': 3, 'Agree a little': 4, 'Agree strongly': 5,
}
for col in ac:
    ac[col] = ac[col].apply(lambda v: _LIKERT_SCORES.get(v, v))
# Demographic/economic variables from the imputations module.  The module has
# several rows per respondent (one per imputation), hence the dedup below.
# NOTE(review): thinc2 is treated as household income below — confirm against
# the codebook.
gv_aim = gv_imputations[['mergeid', 'age', 'gender', 'single', 'mstat', 'yedu', 'cjs', 'thinc2']].dropna()
gv_aim.shape
(386010, 8)
# Keep the first imputation per respondent and restrict to ages 50+.
gv_aim = gv_aim.groupby('mergeid').first()
gv_aim = gv_aim[gv_aim['age'] >= 50]
# Join the activities answers with the demographics on the respondent id.
df = ac.merge(gv_aim, on='mergeid')
# Country-level medians of income and means of life satisfaction (ac012_).
df_wage_life_satisfaction = df.groupby('country').agg({"thinc2":"median", "ac012_":"mean"})
df_wage_life_satisfaction.columns = ['Median Wage', 'Life Satisfaction']
df_wage_life_satisfaction
| Median Wage | Life Satisfaction | |
|---|---|---|
| country | ||
| Austria | 27600.000000 | 8.253930 |
| Belgium | 29557.368651 | 7.738783 |
| Bulgaria | 3681.357906 | 6.210499 |
| Croatia | 8038.908513 | 7.117545 |
| Cyprus | 18000.000000 | 7.897864 |
| Czech Republic | 10028.109012 | 7.599840 |
| Denmark | 43556.582857 | 8.536147 |
| Estonia | 9480.000000 | 6.888740 |
| Finland | 33600.000000 | 8.329060 |
| France | 27600.000000 | 7.450285 |
| Germany | 30000.000000 | 7.857905 |
| Greece | 12000.000000 | 7.049641 |
| Hungary | 7606.972999 | 6.874231 |
| Israel | 35448.770996 | 7.956146 |
| Italy | 20400.000000 | 7.578187 |
| Latvia | 6000.000000 | 6.624591 |
| Lithuania | 6000.000000 | 6.491956 |
| Luxembourg | 55181.852981 | 7.987478 |
| Malta | 12000.000000 | 8.167785 |
| Poland | 8456.659680 | 7.469256 |
| Portugal | 12000.000000 | 7.684457 |
| Romania | 3939.765389 | 7.367184 |
| Slovakia | 12000.000000 | 7.617378 |
| Slovenia | 14400.000000 | 7.424198 |
| Spain | 15600.000000 | 7.640291 |
| Sweden | 33627.049822 | 8.354010 |
| Switzerland | 53971.392346 | 8.432373 |
import plotly.express as px

# The two country maps share everything except the metric and the palette,
# so draw them from one loop.
for metric, palette in (("Median Wage", "blues"), ("Life Satisfaction", "greens")):
    fig = px.choropleth(df_wage_life_satisfaction.reset_index(), locations="country",
                        locationmode='country names', color=metric,
                        hover_name="country", scope='europe',
                        color_continuous_scale=palette, title=metric)
    fig.update(layout_coloraxis_showscale=True)
    fig.show()
# Mean wellbeing scores grouped by current job situation, as a bar chart.
wellbeing_items = {"ac012_": "mean", "ac016_": "mean", "ac017_": "mean"}
es = df.groupby('cjs').agg(wellbeing_items)
es.columns = ['LifeSat', 'LeftOut', 'DoWhatWant']
es.dropna().plot.barh()
<AxesSubplot:ylabel='cjs'>
# The 'Interpolated' placeholder has served its purpose (Romania survived the
# earlier dropna); convert it back to missing and drop those rows now.
df = df.apply(
    lambda col: col.apply(
        lambda val: np.nan if val == "Interpolated" else val
    )
)
df = df.dropna()
# Items feeding the factor analysis.  .copy() gives an independent frame so
# that the factor-score columns added later do not raise
# SettingWithCopyWarning against the parent df.
corr_df = df[['ac014_', 'ac015_', 'ac016_', 'ac017_', 'ac018_',
              'ac019_', 'ac020_', 'ac021_', 'ac022_', 'ac023_',
              'ac024_', 'ac025_']].copy()
corr = corr_df.corr()
corr.style.background_gradient(cmap='coolwarm')
| ac014_ | ac015_ | ac016_ | ac017_ | ac018_ | ac019_ | ac020_ | ac021_ | ac022_ | ac023_ | ac024_ | ac025_ | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ac014_ | 1.000000 | 0.452976 | 0.386468 | -0.261215 | 0.140704 | 0.252529 | -0.206355 | -0.257534 | -0.154932 | -0.416880 | -0.344952 | -0.364573 |
| ac015_ | 0.452976 | 1.000000 | 0.503004 | -0.233042 | 0.198521 | 0.250057 | -0.224599 | -0.268606 | -0.176017 | -0.351900 | -0.296373 | -0.329618 |
| ac016_ | 0.386468 | 0.503004 | 1.000000 | -0.251771 | 0.206150 | 0.290144 | -0.263379 | -0.322571 | -0.230004 | -0.341738 | -0.345645 | -0.354189 |
| ac017_ | -0.261215 | -0.233042 | -0.251771 | 1.000000 | -0.047013 | -0.164002 | 0.281548 | 0.321422 | 0.232715 | 0.366042 | 0.348815 | 0.350667 |
| ac018_ | 0.140704 | 0.198521 | 0.206150 | -0.047013 | 1.000000 | 0.246160 | -0.068939 | -0.064999 | -0.064887 | -0.044516 | -0.061381 | -0.070154 |
| ac019_ | 0.252529 | 0.250057 | 0.290144 | -0.164002 | 0.246160 | 1.000000 | -0.173317 | -0.192027 | -0.193867 | -0.195230 | -0.272121 | -0.296930 |
| ac020_ | -0.206355 | -0.224599 | -0.263379 | 0.281548 | -0.068939 | -0.173317 | 1.000000 | 0.553536 | 0.361880 | 0.384247 | 0.413541 | 0.437354 |
| ac021_ | -0.257534 | -0.268606 | -0.322571 | 0.321422 | -0.064999 | -0.192027 | 0.553536 | 1.000000 | 0.450083 | 0.444719 | 0.474275 | 0.500598 |
| ac022_ | -0.154932 | -0.176017 | -0.230004 | 0.232715 | -0.064887 | -0.193867 | 0.361880 | 0.450083 | 1.000000 | 0.331900 | 0.383088 | 0.402166 |
| ac023_ | -0.416880 | -0.351900 | -0.341738 | 0.366042 | -0.044516 | -0.195230 | 0.384247 | 0.444719 | 0.331900 | 1.000000 | 0.562482 | 0.557246 |
| ac024_ | -0.344952 | -0.296373 | -0.345645 | 0.348815 | -0.061381 | -0.272121 | 0.413541 | 0.474275 | 0.383088 | 0.562482 | 1.000000 | 0.643846 |
| ac025_ | -0.364573 | -0.329618 | -0.354189 | 0.350667 | -0.070154 | -0.296930 | 0.437354 | 0.500598 | 0.402166 | 0.557246 | 0.643846 | 1.000000 |
# Flatten the correlation matrix into pairs, sorted by strength.
# NOTE(review): drop_duplicates() deduplicates on the correlation VALUES, so
# besides the mirrored pairs it would also collapse two distinct pairs that
# happen to share a value.
corr_series = corr_df.corr().unstack().sort_values().drop_duplicates()
# Pairs with a notable positive correlation (one 1.0 self-pair survives too).
corr_series[corr_series > 0.3]
ac017_ ac021_ 0.321422 ac022_ ac023_ 0.331900 ac017_ ac024_ 0.348815 ac025_ ac017_ 0.350667 ac022_ ac020_ 0.361880 ac023_ ac017_ 0.366042 ac024_ ac022_ 0.383088 ac023_ ac020_ 0.384247 ac014_ ac016_ 0.386468 ac025_ ac022_ 0.402166 ac024_ ac020_ 0.413541 ac025_ ac020_ 0.437354 ac023_ ac021_ 0.444719 ac022_ ac021_ 0.450083 ac015_ ac014_ 0.452976 ac024_ ac021_ 0.474275 ac025_ ac021_ 0.500598 ac016_ ac015_ 0.503004 ac021_ ac020_ 0.553536 ac025_ ac023_ 0.557246 ac023_ ac024_ 0.562482 ac025_ ac024_ 0.643846 ac014_ ac014_ 1.000000 dtype: float64
# Pairs with a notable negative correlation.
corr_series[corr_series < -0.3]
ac014_ ac023_ -0.416880 ac025_ ac014_ -0.364573 ac016_ ac025_ -0.354189 ac015_ ac023_ -0.351900 ac024_ ac016_ -0.345645 ac014_ ac024_ -0.344952 ac023_ ac016_ -0.341738 ac015_ ac025_ -0.329618 ac021_ ac016_ -0.322571 dtype: float64
%load_ext rpy2.ipython
C:\Users\kiril\AppData\Roaming\Python\Python38\site-packages\rpy2\robjects\packages.py:366: UserWarning: The symbol 'quartz' is not in this R namespace/package.
# %%R -i corr_df
# install.packages("dendextend")
# library(dendextend)
# aac = cor(corr_df)
# scree(aac)
import pandas as pd
from sklearn.datasets import load_iris
from factor_analyzer import FactorAnalyzer
import matplotlib.pyplot as plt
# Exploratory factor analysis on the activities items: fit with the default
# number of factors first, just to inspect the eigenvalues.
fa = FactorAnalyzer()
fa.fit(corr_df)
eigen_values, vectors = fa.get_eigenvalues()
# NOTE(review): this plots `vectors + 1` (the second array returned by
# get_eigenvalues(), shifted by one) — a conventional scree plot would use
# `eigen_values`; confirm the intent.
plt.plot(vectors+1,'-.')
plt.title('Scree Plot')
plt.xlabel('component number')
plt.ylabel('Eigen values')
Text(0, 0.5, 'Eigen values')
# Two eigenvalues exceed 1 (Kaiser criterion), motivating n_factors=2 below.
eigen_values
array([4.47569495, 1.38995794, 0.99747547, 0.79102704, 0.75501688,
0.6843992 , 0.63857954, 0.57629819, 0.47843893, 0.43596334,
0.42505564, 0.35209289])
# Re-fit with the two factors suggested by the eigenvalues above.
fa.set_params(n_factors=2)
fa.fit(corr_df)
loadings = fa.loadings_
factors = fa.transform(corr_df)
factors_df = pd.DataFrame(factors)
# Attach the factor scores positionally from the ndarray.  The original
# assigned `factors_df[0]` (fresh 0..n-1 RangeIndex) into corr_df, which kept
# df's gapped index after dropna(): pandas aligns on labels there, scrambling
# rows and producing NaNs.  It also raised SettingWithCopyWarning on the
# sliced frame; assign() rebinds a new frame instead.
corr_df = corr_df.assign(Factor_1=factors[:, 0], Factor_2=factors[:, 1])
<ipython-input-45-501b418dab6d>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-45-501b418dab6d>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Recompute correlations, now including the two factor-score columns.
corr = corr_df.corr()
corr.style.background_gradient(cmap='coolwarm')
| ac014_ | ac015_ | ac016_ | ac017_ | ac018_ | ac019_ | ac020_ | ac021_ | ac022_ | ac023_ | ac024_ | ac025_ | Factor_1 | Factor_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ac014_ | 1.000000 | 0.452976 | 0.386468 | -0.261215 | 0.140704 | 0.252529 | -0.206355 | -0.257534 | -0.154932 | -0.416880 | -0.344952 | -0.364573 | -0.419530 | 0.640906 |
| ac015_ | 0.452976 | 1.000000 | 0.503004 | -0.233042 | 0.198521 | 0.250057 | -0.224599 | -0.268606 | -0.176017 | -0.351900 | -0.296373 | -0.329618 | -0.395943 | 0.746140 |
| ac016_ | 0.386468 | 0.503004 | 1.000000 | -0.251771 | 0.206150 | 0.290144 | -0.263379 | -0.322571 | -0.230004 | -0.341738 | -0.345645 | -0.354189 | -0.456781 | 0.712579 |
| ac017_ | -0.261215 | -0.233042 | -0.251771 | 1.000000 | -0.047013 | -0.164002 | 0.281548 | 0.321422 | 0.232715 | 0.366042 | 0.348815 | 0.350667 | 0.473680 | -0.372979 |
| ac018_ | 0.140704 | 0.198521 | 0.206150 | -0.047013 | 1.000000 | 0.246160 | -0.068939 | -0.064999 | -0.064887 | -0.044516 | -0.061381 | -0.070154 | -0.084269 | 0.316507 |
| ac019_ | 0.252529 | 0.250057 | 0.290144 | -0.164002 | 0.246160 | 1.000000 | -0.173317 | -0.192027 | -0.193867 | -0.195230 | -0.272121 | -0.296930 | -0.320230 | 0.447540 |
| ac020_ | -0.206355 | -0.224599 | -0.263379 | 0.281548 | -0.068939 | -0.173317 | 1.000000 | 0.553536 | 0.361880 | 0.384247 | 0.413541 | 0.437354 | 0.621347 | -0.310559 |
| ac021_ | -0.257534 | -0.268606 | -0.322571 | 0.321422 | -0.064999 | -0.192027 | 0.553536 | 1.000000 | 0.450083 | 0.444719 | 0.474275 | 0.500598 | 0.716329 | -0.374348 |
| ac022_ | -0.154932 | -0.176017 | -0.230004 | 0.232715 | -0.064887 | -0.193867 | 0.361880 | 0.450083 | 1.000000 | 0.331900 | 0.383088 | 0.402166 | 0.547563 | -0.275128 |
| ac023_ | -0.416880 | -0.351900 | -0.341738 | 0.366042 | -0.044516 | -0.195230 | 0.384247 | 0.444719 | 0.331900 | 1.000000 | 0.562482 | 0.557246 | 0.677795 | -0.517910 |
| ac024_ | -0.344952 | -0.296373 | -0.345645 | 0.348815 | -0.061381 | -0.272121 | 0.413541 | 0.474275 | 0.383088 | 0.562482 | 1.000000 | 0.643846 | 0.740141 | -0.487361 |
| ac025_ | -0.364573 | -0.329618 | -0.354189 | 0.350667 | -0.070154 | -0.296930 | 0.437354 | 0.500598 | 0.402166 | 0.557246 | 0.643846 | 1.000000 | 0.761609 | -0.516346 |
| Factor_1 | -0.419530 | -0.395943 | -0.456781 | 0.473680 | -0.084269 | -0.320230 | 0.621347 | 0.716329 | 0.547563 | 0.677795 | 0.740141 | 0.761609 | 1.000000 | -0.690235 |
| Factor_2 | 0.640906 | 0.746140 | 0.712579 | -0.372979 | 0.316507 | 0.447540 | -0.310559 | -0.374348 | -0.275128 | -0.517910 | -0.487361 | -0.516346 | -0.690235 | 1.000000 |
# Items correlating positively with the first factor, strongest first.
corr[corr['Factor_1'] > 0]['Factor_1'].sort_values(ascending=False)
Factor_1 1.000000 ac025_ 0.761609 ac024_ 0.740141 ac021_ 0.716329 ac023_ 0.677795 ac020_ 0.621347 ac022_ 0.547563 ac017_ 0.473680 Name: Factor_1, dtype: float64
# Items correlating positively with the second factor, strongest first.
corr[corr['Factor_2'] > 0]['Factor_2'].sort_values(ascending=False)
Factor_2 1.000000 ac015_ 0.746140 ac016_ 0.712579 ac014_ 0.640906 ac019_ 0.447540 ac018_ 0.316507 Name: Factor_2, dtype: float64
# Attach the respondent's country to the factor scores.  df kept a gapped
# index after dropna() while factors_df counts 0..n-1, so the original
# label-aligned assignment mismatched rows — that is why Romania and Slovakia
# showed NaN factor means.  Copy the values positionally instead.
factors_df['country'] = df['country'].to_numpy()
factors_df.columns = ['MR1', 'MR2', 'country']
# Mean factor scores per country.
factors_df.groupby('country').mean()
| MR1 | MR2 | |
|---|---|---|
| country | ||
| Austria | 0.325693 | -0.457261 |
| Belgium | 0.201459 | -0.122745 |
| Bulgaria | -0.564852 | 0.433664 |
| Croatia | -0.133694 | 0.219955 |
| Cyprus | -0.094447 | 0.293831 |
| Czech Republic | -0.170374 | 0.225735 |
| Denmark | 0.585978 | -0.394890 |
| Estonia | -0.300666 | 0.034354 |
| Finland | 0.222840 | 0.019432 |
| France | 0.167900 | -0.067557 |
| Germany | 0.302761 | -0.253279 |
| Greece | -0.535937 | 0.610024 |
| Hungary | -0.399798 | 0.025018 |
| Israel | 0.061645 | 0.123982 |
| Italy | -0.154268 | 0.278524 |
| Latvia | -0.536892 | 0.123840 |
| Lithuania | -0.410271 | 0.608445 |
| Luxembourg | 0.370236 | -0.416786 |
| Malta | 0.128402 | -0.071670 |
| Poland | 0.003619 | -0.056039 |
| Portugal | -0.391569 | 0.208524 |
| Romania | NaN | NaN |
| Slovakia | NaN | NaN |
| Slovenia | 0.093621 | -0.139435 |
| Spain | -0.156529 | -0.016771 |
| Sweden | 0.186603 | -0.242022 |
| Switzerland | 0.513112 | -0.385994 |
# Four maps: each factor score, aggregated per country by mean and by median.
for factor in ("MR1", "MR2"):
    for aggregate in ("mean", "median"):
        per_country = factors_df.groupby('country').agg(aggregate).reset_index()
        fig = px.choropleth(per_country, locations="country",
                            locationmode='country names', color=factor,
                            hover_name="country", scope='europe',
                            color_continuous_scale="blues",
                            title=factor + aggregate)
        fig.update(layout_coloraxis_showscale=True)
        fig.show()
# Work-quality module: load, summarise, and locate the time-pressure items.
wq, wq_meta = read_spss('wq')
wq_desc = get_desc_df(wq, wq_meta)
wq_desc[wq_desc['varlab'].str.contains('press')]
| var | varlab | values | uniques | nas | |
|---|---|---|---|---|---|
| 9 | wq018_1 | Work has heavy time pressure | double | 7 | 72555 |
| 24 | wq018_2 | Work has heavy time pressure | double | 6 | 72755 |
| 39 | wq018_3 | Work has heavy time pressure | double | 7 | 74064 |
| 54 | wq018_4 | Work has heavy time pressure | double | 7 | 75120 |
| 69 | wq018_5 | Work has heavy time pressure | double | 7 | 75986 |
| 84 | wq018_6 | Work has heavy time pressure | double | 6 | 76442 |
| 99 | wq018_7 | Work has heavy time pressure | double | 7 | 76772 |
| 114 | wq018_8 | Work has heavy time pressure | double | 6 | 76939 |
| 129 | wq018_9 | Work has heavy time pressure | double | 6 | 77070 |
| 144 | wq018_10 | Work has heavy time pressure | double | 6 | 77129 |
| 159 | wq018_11 | Work has heavy time pressure | double | 5 | 77168 |
| 174 | wq018_12 | Work has heavy time pressure | double | 5 | 77179 |
| 189 | wq018_13 | Work has heavy time pressure | double | 5 | 77190 |
| 204 | wq018_14 | Work has heavy time pressure | double | 5 | 77195 |
| 219 | wq018_15 | Work has heavy time pressure | double | 4 | 77197 |
| 234 | wq018_16 | Work has heavy time pressure | double | 2 | 77201 |
| 249 | wq018_18 | Work has heavy time pressure | double | 2 | 77201 |
| 264 | wq004_ | Work had heavy time pressure | double | 7 | 38765 |
# The per-job wq018_* items are very sparse (see non-null counts below), so
# the retrospective wq004_ item is used instead.
wq[[i for i in wq if 'wq018' in i]].info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 77202 entries, 0 to 77201 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 wq018_1 4647 non-null category 1 wq018_2 4447 non-null category 2 wq018_3 3138 non-null category 3 wq018_4 2082 non-null category 4 wq018_5 1216 non-null category 5 wq018_6 760 non-null category 6 wq018_7 430 non-null category 7 wq018_8 263 non-null category 8 wq018_9 132 non-null category 9 wq018_10 73 non-null category 10 wq018_11 34 non-null category 11 wq018_12 23 non-null category 12 wq018_13 12 non-null category 13 wq018_14 7 non-null category 14 wq018_15 5 non-null category 15 wq018_16 1 non-null category 16 wq018_18 1 non-null category dtypes: category(17) memory usage: 1.3 MB
# Answer distribution before cleaning the non-answers.
wq['wq004_'].value_counts()
Agree 12994 Disagree 12898 Strongly agree 6443 Strongly disagree 5152 Don't know 929 Refusal 21 Name: wq004_, dtype: int64
# Convert interviewer non-answers to missing values across every column,
# then confirm they are gone from wq004_.
wq = wq.apply(
    lambda column: column.apply(
        lambda answer: np.nan if answer in ("Don't know", "Refusal") else answer
    )
)
wq['wq004_'].value_counts()
Agree 12994 Disagree 12898 Strongly agree 6443 Strongly disagree 5152 Name: wq004_, dtype: int64
# Start the job-quality feature frame with the identifier and the
# time-pressure item.  .copy() makes it independent of wq, so the feature
# columns added below no longer raise SettingWithCopyWarning.
poor_jq_prep = wq[['mergeid', 'wq004_']].copy()
poor_jq_prep.columns = ['mergeid', 'pressure']
# Locate the emotional-demand items.
wq_desc[wq_desc['varlab'].str.contains('emo')]
| var | varlab | values | uniques | nas | |
|---|---|---|---|---|---|
| 10 | wq019_1 | Work is emotionally demanding | double | 7 | 72555 |
| 25 | wq019_2 | Work is emotionally demanding | double | 7 | 72755 |
| 40 | wq019_3 | Work is emotionally demanding | double | 7 | 74064 |
| 55 | wq019_4 | Work is emotionally demanding | double | 7 | 75120 |
| 70 | wq019_5 | Work is emotionally demanding | double | 7 | 75986 |
| 85 | wq019_6 | Work is emotionally demanding | double | 6 | 76442 |
| 100 | wq019_7 | Work is emotionally demanding | double | 7 | 76772 |
| 115 | wq019_8 | Work is emotionally demanding | double | 6 | 76939 |
| 130 | wq019_9 | Work is emotionally demanding | double | 6 | 77070 |
| 145 | wq019_10 | Work is emotionally demanding | double | 6 | 77129 |
| 160 | wq019_11 | Work is emotionally demanding | double | 5 | 77168 |
| 175 | wq019_12 | Work is emotionally demanding | double | 5 | 77179 |
| 190 | wq019_13 | Work is emotionally demanding | double | 5 | 77190 |
| 205 | wq019_14 | Work is emotionally demanding | double | 4 | 77195 |
| 220 | wq019_15 | Work is emotionally demanding | double | 4 | 77197 |
| 235 | wq019_16 | Work is emotionally demanding | double | 2 | 77201 |
| 250 | wq019_18 | Work is emotionally demanding | double | 2 | 77201 |
| 265 | wq005_ | Work was emotionally demanding | double | 7 | 38765 |
# wq005_: retrospective "work was emotionally demanding".
poor_jq_prep['emo_demanding'] = wq['wq005_']
<ipython-input-65-1dfa862f0fc4>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Locate the conflict items.
wq_desc[wq_desc['varlab'].str.contains('confl')]
| var | varlab | values | uniques | nas | |
|---|---|---|---|---|---|
| 11 | wq020_1 | Work involves conflicts | double | 7 | 72555 |
| 26 | wq020_2 | Work involves conflicts | double | 7 | 72755 |
| 41 | wq020_3 | Work involves conflicts | double | 7 | 74064 |
| 56 | wq020_4 | Work involves conflicts | double | 7 | 75120 |
| 71 | wq020_5 | Work involves conflicts | double | 7 | 75986 |
| 86 | wq020_6 | Work involves conflicts | double | 6 | 76442 |
| 101 | wq020_7 | Work involves conflicts | double | 6 | 76772 |
| 116 | wq020_8 | Work involves conflicts | double | 5 | 76939 |
| 131 | wq020_9 | Work involves conflicts | double | 6 | 77070 |
| 146 | wq020_10 | Work involves conflicts | double | 6 | 77129 |
| 161 | wq020_11 | Work involves conflicts | double | 5 | 77168 |
| 176 | wq020_12 | Work involves conflicts | double | 5 | 77179 |
| 191 | wq020_13 | Work involves conflicts | double | 5 | 77190 |
| 206 | wq020_14 | Work involves conflicts | double | 5 | 77195 |
| 221 | wq020_15 | Work involves conflicts | double | 3 | 77197 |
| 236 | wq020_16 | Work involves conflicts | double | 2 | 77201 |
| 251 | wq020_18 | Work involves conflicts | double | 2 | 77201 |
| 266 | wq006_ | Work involved conflicts | double | 7 | 38765 |
# wq006_: retrospective "work involved conflicts".
poor_jq_prep['conflicts'] = wq['wq006_']
<ipython-input-67-1aba32a59121>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# List the retrospective wq00x items to pick the remaining features from.
wq_desc[wq_desc['var'].str.contains('wq00')]
| var | varlab | values | uniques | nas | |
|---|---|---|---|---|---|
| 262 | wq002_ | Work was physically demanding | double | 7 | 38765 |
| 263 | wq003_ | Work was uncomfortable | double | 7 | 38765 |
| 264 | wq004_ | Work had heavy time pressure | double | 7 | 38765 |
| 265 | wq005_ | Work was emotionally demanding | double | 7 | 38765 |
| 266 | wq006_ | Work involved conflicts | double | 7 | 38765 |
| 267 | wq007_ | Work had little freedom to decide | double | 7 | 38765 |
| 268 | wq008_ | Work allowed development of skills | double | 7 | 38765 |
| 269 | wq009_ | Work gave recognition | double | 7 | 38766 |
# wq002_: physically demanding; wq003_: uncomfortable conditions.
poor_jq_prep['phys_demanding'] = wq['wq002_']
poor_jq_prep['uncomfortable'] = wq['wq003_']
<ipython-input-69-dea2890a9b43>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-69-dea2890a9b43>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Reward-side items.  wq007_ is "work had little freedom to decide", i.e. a
# reverse-coded freedom item — hence the name negative_freedom.
# NOTE(review): wq014_ (security) and wq011_ (support) do not appear in the
# descriptions printed above — confirm their labels against the codebook.
poor_jq_prep['security'] = wq['wq014_']
poor_jq_prep['negative_freedom'] = wq['wq007_']
poor_jq_prep['skills'] = wq['wq008_']
poor_jq_prep['support'] = wq['wq011_']
poor_jq_prep['recognition'] = wq['wq009_']
<ipython-input-70-aabcd66dbf26>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-70-aabcd66dbf26>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-70-aabcd66dbf26>:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-70-aabcd66dbf26>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-70-aabcd66dbf26>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Strip non-answers here as well, then keep complete cases only.
poor_jq_prep = poor_jq_prep.apply(
    lambda column: column.apply(
        lambda answer: np.nan if answer in ("Don't know", "Refusal") else answer
    )
)
poor_jq_prep = poor_jq_prep.dropna()
# Sanity check: exactly one row per respondent id (displayed below).
len(set(poor_jq_prep['mergeid'])) == len(poor_jq_prep['mergeid'])
True
# Show the distinct answers still present in each survey item.
for column in poor_jq_prep:
    if column != 'mergeid':
        print(column)
        print(set(poor_jq_prep[column]))
pressure
{'Strongly agree', 'Agree', 'Strongly disagree', 'Disagree'}
emo_demanding
{'Strongly agree', 'Agree', 'Strongly disagree', 'Disagree'}
conflicts
{'Strongly agree', 'Agree', 'Strongly disagree', 'Disagree'}
phys_demanding
{'Agree', 'Strongly agree', 'Strongly disagree', 'Disagree'}
uncomfortable
{'Strongly agree', 'Agree', 'Strongly disagree', 'Disagree'}
security
{'Agree', 'Strongly agree', 'Strongly disagree', 'Disagree'}
negative_freedom
{'Agree', 'Strongly agree', 'Strongly disagree', 'Disagree'}
skills
{'Strongly agree', 'Agree', 'Strongly disagree', 'Disagree'}
support
{'Strongly agree', 'Agree', 'Strongly disagree', 'Disagree'}
recognition
{'Agree', 'Strongly agree', 'Strongly disagree', 'Disagree'}
# Recode the 4-point agreement scale to numeric scores in one lookup pass per
# column (the original chained four element-wise .apply passes).  Values not
# in the table — the mergeid strings — pass through unchanged, as before.
_AGREEMENT_SCORES = {
    'Strongly disagree': 1, 'Disagree': 2, 'Agree': 3, 'Strongly agree': 4,
}
for col in poor_jq_prep:
    poor_jq_prep[col] = poor_jq_prep[col].apply(lambda v: _AGREEMENT_SCORES.get(v, v))
# Frequency tables for the recoded items.
for column in poor_jq_prep:
    if column != 'mergeid':
        print(column)
        print(poor_jq_prep[column].value_counts())
pressure 3 12522 2 12324 4 6233 1 4855 Name: pressure, dtype: int64 emo_demanding 3 12241 2 11528 4 6385 1 5780 Name: emo_demanding, dtype: int64 conflicts 2 16066 1 10059 3 7413 4 2396 Name: conflicts, dtype: int64 phys_demanding 3 11466 4 10012 2 9141 1 5315 Name: phys_demanding, dtype: int64 uncomfortable 2 12154 3 9621 1 8658 4 5501 Name: uncomfortable, dtype: int64 security 3 17811 2 7306 4 6905 1 3912 Name: security, dtype: int64 negative_freedom 2 12817 3 10455 1 7879 4 4783 Name: negative_freedom, dtype: int64 skills 3 16062 4 8980 2 7744 1 3148 Name: skills, dtype: int64 support 3 20102 2 6892 4 6773 1 2167 Name: support, dtype: int64 recognition 3 17925 4 9119 2 6606 1 2284 Name: recognition, dtype: int64
def poor_jq_gen(pressure, emo_demanding, conflicts, phys_demanding, unconfortable,
                freedom, skills, support, recognition, security):
    """Effort/reward ratio for overall job quality.

    The mean of the five demand items divided by the mean of the five
    reward items; values above 1 indicate demands outweighing rewards.
    """
    demand_items = (pressure, emo_demanding, conflicts, phys_demanding, unconfortable)
    reward_items = (freedom, skills, support, recognition, security)
    return (sum(demand_items) / 5) / (sum(reward_items) / 5)
def bad_stress_gen(pressure, emo_demanding, conflicts,
                   freedom, skills, support, recognition, security):
    """Return the stress/reward ratio for one respondent.

    Like poor_jq_gen but the numerator averages only the three
    psychological-stress items (pressure, emotional demands, conflicts);
    the denominator is the same five-item reward mean.
    """
    mean_stress = (pressure + emo_demanding + conflicts) / 3
    mean_reward = (freedom + skills + support + recognition + security) / 5
    return mean_stress / mean_reward
# Reverse-code 'negative_freedom' (1..4 -> 4..1) so that a higher value
# means more freedom, consistent with the other reward items.
poor_jq_prep['freedom'] = 5 - poor_jq_prep['negative_freedom']
# Vectorised versions of poor_jq_gen / bad_stress_gen: the original used
# DataFrame.apply(axis=1), which runs the same arithmetic row by row at
# Python speed. Column arithmetic gives identical results (NaNs propagate
# the same way) in a single pass.
effort = (poor_jq_prep['pressure'] + poor_jq_prep['emo_demanding']
          + poor_jq_prep['conflicts'] + poor_jq_prep['phys_demanding']
          + poor_jq_prep['uncomfortable']) / 5
stress = (poor_jq_prep['pressure'] + poor_jq_prep['emo_demanding']
          + poor_jq_prep['conflicts']) / 3
reward = (poor_jq_prep['freedom'] + poor_jq_prep['skills']
          + poor_jq_prep['support'] + poor_jq_prep['recognition']
          + poor_jq_prep['security']) / 5
# Effort/reward imbalance and stress/reward ratio per respondent.
poor_jq_prep['poor_jq'] = effort / reward
poor_jq_prep['bad_stress'] = stress / reward
wq_desc[wq_desc['varlab'].str.contains('salary')]
| var | varlab | values | uniques | nas | |
|---|---|---|---|---|---|
| 15 | wq024_1 | Work has adequate salary | double | 7 | 72555 |
| 30 | wq024_2 | Work has adequate salary | double | 7 | 72755 |
| 45 | wq024_3 | Work has adequate salary | double | 7 | 74064 |
| 60 | wq024_4 | Work has adequate salary | double | 7 | 75120 |
| 75 | wq024_5 | Work has adequate salary | double | 7 | 75986 |
| 90 | wq024_6 | Work has adequate salary | double | 6 | 76442 |
| 105 | wq024_7 | Work has adequate salary | double | 7 | 76772 |
| 120 | wq024_8 | Work has adequate salary | double | 6 | 76939 |
| 135 | wq024_9 | Work has adequate salary | double | 6 | 77070 |
| 150 | wq024_10 | Work has adequate salary | double | 6 | 77129 |
| 165 | wq024_11 | Work has adequate salary | double | 5 | 77168 |
| 180 | wq024_12 | Work has adequate salary | double | 5 | 77179 |
| 195 | wq024_13 | Work has adequate salary | double | 3 | 77190 |
| 210 | wq024_14 | Work has adequate salary | double | 3 | 77195 |
| 225 | wq024_15 | Work has adequate salary | double | 5 | 77197 |
| 240 | wq024_16 | Work has adequate salary | double | 2 | 77201 |
| 255 | wq024_18 | Work has adequate salary | double | 2 | 77201 |
| 270 | wq010_ | Work had adequate salary | double | 7 | 38765 |
Will swap wq024 with wq010_ due to its reduced number of NAs (38,765 vs. 72,555+ for the wq024_* items).
# Use wq010_ ('Work had adequate salary') instead of the wq024_* items:
# it has far fewer missing values.
# .copy() detaches the slice from `wq`, so the assignment below no longer
# triggers the SettingWithCopyWarning the original produced.
wq024 = wq[['mergeid', 'wq010_']].copy()
_salary_to_score = {
    'Strongly disagree': 1,
    'Disagree': 2,
    'Agree': 3,
    'Strongly agree': 4,
}
# Single mapping pass instead of four chained .apply calls; unmapped
# values (NaN) pass through unchanged.
wq024['wq010_'] = wq024['wq010_'].apply(lambda x: _salary_to_score.get(x, x))
<ipython-input-83-b691e2a40a59>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
wq024.dropna()
| mergeid | wq010_ | |
|---|---|---|
| 0 | AT-001215-01 | 2.0 |
| 2 | AT-001492-02 | 4.0 |
| 3 | AT-001881-01 | 3.0 |
| 4 | AT-001881-02 | 3.0 |
| 5 | AT-002136-01 | 3.0 |
| ... | ... | ... |
| 77196 | SK-993822-02 | 3.0 |
| 77197 | SK-995042-01 | 4.0 |
| 77198 | SK-995042-02 | 4.0 |
| 77200 | SK-999958-01 | 2.0 |
| 77201 | SK-999958-02 | 3.0 |
37361 rows × 2 columns
# Assemble the Case-3 modelling frame: salary item plus the derived
# job-quality indices, joined on the respondent id (inner merge drops
# respondents missing from either table).
df_case3 = wq024.merge(poor_jq_prep, on='mergeid')
df_case3 = df_case3[['mergeid', 'wq010_', 'poor_jq', 'bad_stress']]
# Add the selected activity indicator columns (ac035*) from the activities module.
df_from_activities_needed = df[['mergeid', 'ac035d1', 'ac035d4', 'ac035d7', 'ac035d8', 'ac035d9', 'ac035d10', 'ac035dno']]
df_case3 = df_case3.merge(df_from_activities_needed, on='mergeid')
# Restrict the employment (EP) module to respondents in the Case-3 frame.
ep = ep[ep['mergeid'].isin(df_case3['mergeid'])]
ep_desc = get_desc_df(ep, ep_meta)
ep_desc.to_csv('employment_desc.csv')
# Codebook lookup: industry variables in the employment module.
ep_desc[ep_desc['varlab'].str.contains('industry')]
| var | varlab | values | uniques | nas | |
|---|---|---|---|---|---|
| 15 | ep018_ | Kind of industry working in | double | 1 | 32949 |
| 34 | ep054_ | Kind of industry working in last job | double | 1 | 32949 |
ep.shape
(32949, 436)
Too many NAs in the EP industry variables; switching to the retrospective (RE) module instead.
# Load the retrospective (RE) module with value labels applied.
# NOTE(review): the name `re` shadows the stdlib `re` module -- harmless
# here, but rename it if regular expressions are ever needed later.
re, re_meta = pyreadstat.read_sav('sharew7_rel8-0-0_re.sav', apply_value_formats=True,
formats_as_category=True, formats_as_ordered_category=False)
re_desc = get_desc_df(re, re_meta)
# Keep only respondents already present in the Case-3 frame.
re = re[re['mergeid'].isin(df_case3['mergeid'])]
# Codebook lookup: industry variables (re014_1..20, one per job spell).
re_desc[re_desc['varlab'].str.contains('industry')]
| var | varlab | values | uniques | nas | |
|---|---|---|---|---|---|
| 43 | re014_1 | Job industry | double | 17 | 17962 |
| 59 | re014_2 | Job industry | double | 17 | 37330 |
| 75 | re014_3 | Job industry | double | 17 | 53139 |
| 91 | re014_4 | Job industry | double | 17 | 63405 |
| 107 | re014_5 | Job industry | double | 17 | 69561 |
| 123 | re014_6 | Job industry | double | 17 | 72925 |
| 139 | re014_7 | Job industry | double | 16 | 74843 |
| 155 | re014_8 | Job industry | double | 17 | 75932 |
| 171 | re014_9 | Job industry | double | 16 | 76553 |
| 187 | re014_10 | Job industry | double | 16 | 76866 |
| 203 | re014_11 | Job industry | double | 15 | 77023 |
| 219 | re014_12 | Job industry | double | 14 | 77096 |
| 232 | re014_13 | Job industry | double | 15 | 77143 |
| 246 | re014_14 | Job industry | double | 13 | 77168 |
| 262 | re014_15 | Job industry | double | 9 | 77182 |
| 274 | re014_16 | Job industry | double | 7 | 77191 |
| 286 | re014_17 | Job industry | double | 6 | 77195 |
| 295 | re014_18 | Job industry | double | 4 | 77197 |
| 307 | re014_19 | Job industry | double | 5 | 77198 |
| 316 | re014_20 | Job industry | double | 4 | 77199 |
# First job spell's industry (re014_1 has the fewest NAs of the series).
industry = re['re014_1']
# Codebook lookup: employment-type variables.
re_desc[re_desc['varlab'].str.contains('empl')]
| var | varlab | values | uniques | nas | |
|---|---|---|---|---|---|
| 44 | re015_1 | Was employee civil servant or self | double | 6 | 17962 |
| 48 | re023_1 | First monthly work income in self-employment | double | 136 | 75284 |
| 51 | re027_1 | Current wage if still employed | double | 383 | 72936 |
| 53 | re029_1 | Current work income if still self-employed | double | 61 | 76824 |
| 60 | re015_2 | Was employee civil servant or self | double | 6 | 37330 |
| ... | ... | ... | ... | ... | ... |
| 287 | re015_17 | Was employee civil servant or self | double | 3 | 77195 |
| 296 | re015_18 | Was employee civil servant or self | double | 3 | 77197 |
| 301 | re027_18 | Current wage if still employed | double | 77202 | 77201 |
| 308 | re015_19 | Was employee civil servant or self | double | 3 | 77198 |
| 317 | re015_20 | Was employee civil servant or self | double | 3 | 77199 |
63 rows × 5 columns
# First job spell: employee / civil servant / self-employed.
type_of_employment = re['re015_1']
# Codebook lookup: part-time vs full-time variables.
re_desc[re_desc['varlab'].str.contains('full')]
| var | varlab | values | uniques | nas | |
|---|---|---|---|---|---|
| 6 | re002_ | Year finished fulltime education | double | 100 | 14113 |
| 45 | re016_1 | Job was part or full time | double | 8 | 17962 |
| 61 | re016_2 | Job was part or full time | double | 8 | 37330 |
| 77 | re016_3 | Job was part or full time | double | 8 | 53139 |
| 93 | re016_4 | Job was part or full time | double | 8 | 63405 |
| 109 | re016_5 | Job was part or full time | double | 8 | 69561 |
| 125 | re016_6 | Job was part or full time | double | 7 | 72925 |
| 141 | re016_7 | Job was part or full time | double | 7 | 74843 |
| 157 | re016_8 | Job was part or full time | double | 8 | 75932 |
| 173 | re016_9 | Job was part or full time | double | 6 | 76553 |
| 189 | re016_10 | Job was part or full time | double | 7 | 76866 |
| 205 | re016_11 | Job was part or full time | double | 6 | 77023 |
| 221 | re016_12 | Job was part or full time | double | 5 | 77096 |
| 234 | re016_13 | Job was part or full time | double | 5 | 77143 |
| 248 | re016_14 | Job was part or full time | double | 4 | 77168 |
| 264 | re016_15 | Job was part or full time | double | 3 | 77182 |
| 276 | re016_16 | Job was part or full time | double | 3 | 77191 |
| 288 | re016_17 | Job was part or full time | double | 2 | 77195 |
| 297 | re016_18 | Job was part or full time | double | 2 | 77197 |
| 309 | re016_19 | Job was part or full time | double | 3 | 77198 |
| 318 | re016_20 | Job was part or full time | double | 2 | 77199 |
# Employment features for the first job spell, one-hot encoded.
full_part_time_job = re['re016_1']
employement = pd.DataFrame({
    "mergeid": re['mergeid'],
    "type_of_emp": type_of_employment,
    "full_part": full_part_time_job,
    "industry": industry,
})
# Treat "Don't know" / "Refusal" answers as missing values.
employement = employement.apply(
    lambda column: column.apply(
        lambda answer: np.nan if answer in ("Don't know", "Refusal") else answer
    )
)
# One-hot encode the three categorical columns, then drop the originals.
dummy_cols = pd.get_dummies(employement[['type_of_emp', 'full_part', 'industry']])
employement = employement.join(dummy_cols)
employement = employement.drop(columns=['type_of_emp', 'full_part', 'industry'])
employement
| mergeid | type_of_emp_Civil servant (public sector) | type_of_emp_Employee (private sector) | type_of_emp_Self-employed (including working for family business) | full_part_Always full-time | full_part_Always part-time | full_part_Changed multiple times | full_part_Changed once from full-time to part-time | full_part_Changed once from part-time to full-time | industry_Agriculture, hunting, forestry, fishing | ... | industry_Financial intermediation | industry_Health and social work | industry_Hotels and restaurants | industry_Manufacturing | industry_Mining and quarrying | industry_Other community | industry_Public administration and defence | industry_Real estate, renting and business activities | industry_Transport, storage and communication | industry_Wholesale and retail trade | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | AT-001492-02 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | AT-001881-01 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | AT-001881-02 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 5 | AT-002136-01 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 6 | AT-002136-03 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 77196 | SK-993822-02 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 77197 | SK-995042-01 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 77198 | SK-995042-02 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 77200 | SK-999958-01 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 77201 | SK-999958-02 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
32949 rows × 23 columns
# Attach the employment dummies to the Case-3 frame.
df_case3 = df_case3.merge(employement, on='mergeid')
# Socio-demographics: age, gender, years of education (+ gender dummies).
socio_df = gv_imputations[['mergeid', 'age', 'gender', 'yedu']]
socio_df = socio_df.join(pd.get_dummies(socio_df['gender']))
socio_df['age'].plot.hist()
plt.title('Age')
Text(0.5, 1.0, 'Age')
# Age bands: 50-54, 55-59, 60-64 (working-age sample of interest).
# create a list of our conditions
conditions = [
((socio_df['age'] >= 50.0) & (socio_df['age'] <= 54.0)),
((socio_df['age'] >= 55.0) & (socio_df['age'] <= 59.0)),
((socio_df['age'] >= 60.0) & (socio_df['age'] <= 64.0)),
]
# create a list of the values we want to assign for each condition
values = ['age_50_54', 'age_55_59', 'age_60_64']
# create a new column and use np.select
socio_df['age_cat'] = np.select(conditions, values)
# Ages outside 50-64 fall through to np.select's default (0, promoted to
# the string '0' alongside the string labels) and are dropped here.
# NOTE(review): non-integer ages inside the gaps (e.g. 54.5) are dropped
# too -- confirm ages are recorded as whole years.
socio_df = socio_df[socio_df['age_cat'] != '0']
# Inspect the aggregated imputations table (income variable thinc2 lives here).
gv_aim
| age | gender | single | mstat | yedu | cjs | thinc2 | |
|---|---|---|---|---|---|---|---|
| mergeid | |||||||
| AT-001215-01 | 78.0 | Female | Yes | Never married | 15.0 | Retired | 14400.0 |
| AT-001492-01 | 65.0 | Female | No | Married, living with spouse | 11.0 | Homemaker | 24000.0 |
| AT-001492-02 | 66.0 | Male | No | Married, living with spouse | 13.0 | Retired | 24000.0 |
| AT-001881-01 | 87.0 | Female | Yes | Widowed | 12.0 | Retired | 17400.0 |
| AT-001881-02 | 93.0 | Male | Yes | Married, living with spouse | 3.0 | Retired | 15600.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| SK-995042-01 | 59.0 | Female | No | Married, living with spouse | 10.0 | Retired | 15600.0 |
| SK-995042-02 | 62.0 | Male | No | Married, living with spouse | 10.0 | Retired | 15600.0 |
| SK-996004-01 | 56.0 | Female | Yes | Divorced | 13.0 | Employed or self-employed | 10596.0 |
| SK-999958-01 | 71.0 | Male | No | Married, living with spouse | 11.0 | Retired | 10800.0 |
| SK-999958-02 | 72.0 | Female | No | Married, living with spouse | 11.0 | Retired | 10800.0 |
76377 rows × 7 columns
# Attach total household income (thinc2) from the gv_aim table.
socio_df = socio_df.merge(gv_aim.reset_index()[['mergeid','thinc2']], on='mergeid')
socio_df['thinc2'].plot.hist()
plt.title('Income')
Text(0.5, 1.0, 'Income')
# Tertile cut points (33% / 66%) used for the income categories below.
socio_df[['thinc2']].describe(percentiles=[0.33, 0.66])
| thinc2 | |
|---|---|
| count | 142831.000000 |
| mean | 26393.808218 |
| std | 26502.360081 |
| min | 0.000000 |
| 33% | 12000.000000 |
| 50% | 18000.000000 |
| 66% | 27600.000000 |
| max | 403301.693123 |
# Income terciles based on the 33% / 66% cut points above.
# The original conditions overlapped at the boundaries (12000 matched both
# the first and second condition, 27600 the second and third); np.select
# takes the first matching condition, so the non-overlapping conditions
# below assign exactly the same categories while making the boundary
# handling explicit.
conditions = [
    (socio_df['thinc2'] >= 0) & (socio_df['thinc2'] <= 12000),
    (socio_df['thinc2'] > 12000) & (socio_df['thinc2'] <= 27600),
    (socio_df['thinc2'] > 27600),
]
# Labels matched positionally to the conditions above.
values = ['income_low', 'income_med', 'income_high']
socio_df['Income_cat'] = np.select(conditions, values)
# Years of education: some entries are strings -> treat them as missing.
socio_df['yedu'] = socio_df['yedu'].apply(lambda x: np.nan if isinstance(x, str) else x)
# Tertile cut points used for the education categories below.
socio_df['yedu'].describe(percentiles=[0.33, 0.66])
count 142087.000000 mean 11.997062 std 3.793074 min 1.000000 33% 11.000000 50% 12.000000 66% 13.000000 max 35.000000 Name: yedu, dtype: float64
# Education terciles based on the yedu cut points above.
# create a list of our conditions
conditions = [
((socio_df['yedu'] >= 0) & (socio_df['yedu'] <= 11)),
((socio_df['yedu'] >= 12) & (socio_df['yedu'] <= 14)),
((socio_df['yedu'] > 14)),
]
# create a list of the values we want to assign for each condition
values = ['edu_low', 'edu_med', 'edu_high']
# create a new column and use np.select
socio_df['edu_cat'] = np.select(conditions, values)
# NaN yedu (and any non-integer value between 11 and 12) matches no
# condition, becomes the default '0', and is mapped back to NaN here so
# the later dropna removes it.
socio_df['edu_cat'] = socio_df['edu_cat'].apply(lambda x: np.nan if x == '0' else x)
socio_df['edu_cat'].value_counts()
edu_low 59403 edu_med 50036 edu_high 32648 Name: edu_cat, dtype: int64
socio_df['Income_cat'].value_counts()
income_low 50113 income_high 47670 income_med 45048 Name: Income_cat, dtype: int64
# One-hot encode the three categorical socio-demographic columns.
socio_df = socio_df.join(pd.get_dummies(socio_df[['edu_cat','Income_cat', 'age_cat']]))
# Keep only the identifier and the dummy-encoded socio-demographic features,
# then attach them to the Case-3 frame.
socio_df = socio_df[['mergeid',
'Female', 'Male',
'edu_cat_edu_high', 'edu_cat_edu_low', 'edu_cat_edu_med',
'Income_cat_income_high', 'Income_cat_income_low', 'Income_cat_income_med',
'age_cat_age_50_54', 'age_cat_age_55_59', 'age_cat_age_60_64']]
df_case3 = df_case3.merge(socio_df, on='mergeid')
# Health module (gv_health): activity limitation (gali), self-perceived
# health (sphus) and, below, a chronic-disease indicator.
hh, hh_meta = read_spss('gv_health')
# Treat "Don't know" / "Refusal" answers as missing values.
hh = hh.apply(
    lambda x: x.apply(
        lambda y: np.nan if (y == "Don't know") or (y == "Refusal") else y
    )
)
# .copy() makes `health` an independent frame, so the later column
# assignments (chronic2, gali, sphus) no longer modify a slice of `hh`
# and no longer raise SettingWithCopyWarning as the original did.
health = hh[['mergeid', 'gali', 'sphus']].copy()
hh_desc = get_desc_df(hh, hh_meta)
# Codebook lookup: chronic-disease variables.
hh_desc[hh_desc['varlab'].str.contains('chronic')]
| var | varlab | values | uniques | nas | |
|---|---|---|---|---|---|
| 13 | chronic2w7 | 2+ chronic diseases (w7 version) | double | 3 | 536 |
| 14 | chronicw7c | Number of chronic diseases (w7 version) | double | 550 | 536 |
# Distribution of the "2+ chronic diseases" indicator (wave-7 version).
hh['chronic2w7'].value_counts()
2+ chronic diseases 39451 Less than 2 diseases 37215 Name: chronic2w7, dtype: int64
# Binary flag: 1 = two or more chronic diseases, else 0 (NaN also -> 0).
health['chronic2'] = np.where(hh['chronic2w7']== '2+ chronic diseases', 1, 0)
<ipython-input-132-4c042e694e72>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Will use the current-wave version of the health variables only, as there was not enough time to compile the variants from all waves.
health['gali'] = np.where(hh['gali']== 'Limited', 1, 0)
<ipython-input-133-48a2dc68b2fe>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Recode self-perceived health to an ordinal scale, 1 (Poor) .. 5
# (Excellent), in a single mapping pass instead of the original five
# chained .apply calls. Unmapped values (NaN) pass through unchanged.
_sphus_to_score = {
    'Poor': 1,
    'Fair': 2,
    'Good': 3,
    'Very good': 4,
    'Excellent': 5,
}
health['sphus'] = health['sphus'].apply(lambda x: _sphus_to_score.get(x, x))
<ipython-input-134-702b6ec16249>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Attach health features and the two target factor scores (Factor_1,
# Factor_2 from corr_df), then drop all rows with any missing value.
df_case3 = df_case3.merge(health, on='mergeid')
df_case3 = df_case3.merge(df[['mergeid']].join(corr_df[['Factor_1', 'Factor_2']]), on='mergeid')
df_case3.dropna(inplace=True)
from sklearn.model_selection import train_test_split
# Features: every column except the id and the two targets; target: Factor_1.
X, y = df_case3[[i for i in df_case3.columns if i != 'mergeid' and i !='Factor_1' and i != 'Factor_2']].values,\
df_case3['Factor_1'].values
# (duplicate import -- harmless)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn import linear_model
# NOTE(review): Lasso with the default alpha=1.0 on unstandardised features
# shrinks every coefficient to zero (see coef_ below) -- consider scaling
# the features and tuning alpha (e.g. LassoCV) before concluding that a
# linear model fails here.
clf = linear_model.Lasso()
clf.fit(X_train, y_train)
Lasso()
# NOTE(review): predicts over the FULL X (train + test) and the result is
# never used afterwards.
pred = clf.predict(X)
# All coefficients are exactly zero: alpha=1.0 over-penalises these features.
clf.coef_
array([ 0., -0., -0., 0., 0., 0., 0., 0., 0., -0., -0., 0., -0.,
-0., -0., 0., 0., -0., -0., -0., 0., 0., 0., 0., -0., -0.,
-0., 0., 0., 0., -0., 0., 0., -0., 0., -0., 0., 0., -0.,
0., -0., -0., 0., -0., 0., -0.])
clf.score(X_test, y_test)
-3.8590865895304916e-05
# Same Lasso setup for the second target, Factor_2.
X, y = df_case3[[i for i in df_case3.columns if i != 'mergeid' and i !='Factor_1' and i != 'Factor_2']].values,\
df_case3['Factor_2'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15)
clf_2 = linear_model.Lasso()
clf_2.fit(X_train, y_train)
Lasso()
clf_2.coef_
array([-0., 0., 0., -0., -0., -0., -0., -0., -0., 0., 0., -0., 0.,
-0., 0., 0., -0., 0., 0., 0., -0., -0., -0., -0., 0., 0.,
0., 0., -0., -0., -0., -0., -0., 0., -0., 0., -0., -0., 0.,
-0., 0., 0., -0., 0., -0., 0.])
clf_2.score(X_test, y_test)
-0.00024748253322748326
# Random forest as a non-linear alternative for Factor_1.
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=0)
X, y = df_case3[[i for i in df_case3.columns if i != 'mergeid' and i !='Factor_1' and i != 'Factor_2']].values,\
df_case3['Factor_1'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf.fit(X_train, y_train)
RandomForestRegressor(random_state=0)
rf.score(X_test, y_test)
0.985843241063699
pred = rf.predict(X_test)
# Predicted vs. actual scatter; points near the diagonal indicate good fit.
plt.plot(y_test, pred,'.')
[<matplotlib.lines.Line2D at 0x2622a3e30a0>]
# Impurity-based feature importances for the Factor_1 model.
pd.DataFrame({"Features":df_case3[[i for i in df_case3.columns if i != 'mergeid' and i !='Factor_1' and i != 'Factor_2']].columns,
"Feature Importance":rf.feature_importances_})
| Features | Feature Importance | |
|---|---|---|
| 0 | wq010_ | 0.045297 |
| 1 | poor_jq | 0.156539 |
| 2 | bad_stress | 0.142503 |
| 3 | ac035d1 | 0.012429 |
| 4 | ac035d4 | 0.008372 |
| 5 | ac035d7 | 0.005430 |
| 6 | ac035d8 | 0.015287 |
| 7 | ac035d9 | 0.019444 |
| 8 | ac035d10 | 0.015679 |
| 9 | ac035dno | 0.020574 |
| 10 | type_of_emp_Civil servant (public sector) | 0.014248 |
| 11 | type_of_emp_Employee (private sector) | 0.011756 |
| 12 | type_of_emp_Self-employed (including working f... | 0.004019 |
| 13 | full_part_Always full-time | 0.005258 |
| 14 | full_part_Always part-time | 0.005569 |
| 15 | full_part_Changed multiple times | 0.002811 |
| 16 | full_part_Changed once from full-time to part-... | 0.003818 |
| 17 | full_part_Changed once from part-time to full-... | 0.001162 |
| 18 | industry_Agriculture, hunting, forestry, fishing | 0.013933 |
| 19 | industry_Construction | 0.014088 |
| 20 | industry_Education | 0.006266 |
| 21 | industry_Electricity, gas and water supply | 0.005208 |
| 22 | industry_Financial intermediation | 0.003351 |
| 23 | industry_Health and social work | 0.008285 |
| 24 | industry_Hotels and restaurants | 0.010073 |
| 25 | industry_Manufacturing | 0.020435 |
| 26 | industry_Mining and quarrying | 0.004760 |
| 27 | industry_Other community | 0.013255 |
| 28 | industry_Public administration and defence | 0.008006 |
| 29 | industry_Real estate, renting and business act... | 0.001080 |
| 30 | industry_Transport, storage and communication | 0.011074 |
| 31 | industry_Wholesale and retail trade | 0.015007 |
| 32 | Female | 0.011792 |
| 33 | Male | 0.011751 |
| 34 | edu_cat_edu_high | 0.009663 |
| 35 | edu_cat_edu_low | 0.014610 |
| 36 | edu_cat_edu_med | 0.016202 |
| 37 | Income_cat_income_high | 0.023604 |
| 38 | Income_cat_income_low | 0.020210 |
| 39 | Income_cat_income_med | 0.008405 |
| 40 | age_cat_age_50_54 | 0.010292 |
| 41 | age_cat_age_55_59 | 0.014039 |
| 42 | age_cat_age_60_64 | 0.015042 |
| 43 | gali | 0.018590 |
| 44 | sphus | 0.178691 |
| 45 | chronic2 | 0.022089 |
# Random forest for the second target, Factor_2.
X, y = df_case3[[i for i in df_case3.columns if i != 'mergeid' and i !='Factor_1' and i != 'Factor_2']].values,\
df_case3['Factor_2'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15)
rf_2 = RandomForestRegressor(random_state=2)
rf_2.fit(X_train, y_train)
RandomForestRegressor(random_state=2)
rf_2.score(X_test, y_test)
0.9902248312949842
pred = rf_2.predict(X_test)
# Predicted vs. actual scatter for the Factor_2 model.
plt.plot(y_test, pred,'.')
[<matplotlib.lines.Line2D at 0x2622a43b2b0>]
# Impurity-based feature importances for the Factor_2 model.
pd.DataFrame({"Features":df_case3[[i for i in df_case3.columns if i != 'mergeid' and i !='Factor_1' and i != 'Factor_2']].columns,
"Feature Importance":rf_2.feature_importances_})
| Features | Feature Importance | |
|---|---|---|
| 0 | wq010_ | 0.049591 |
| 1 | poor_jq | 0.173799 |
| 2 | bad_stress | 0.142549 |
| 3 | ac035d1 | 0.013811 |
| 4 | ac035d4 | 0.008390 |
| 5 | ac035d7 | 0.007451 |
| 6 | ac035d8 | 0.015849 |
| 7 | ac035d9 | 0.019101 |
| 8 | ac035d10 | 0.017099 |
| 9 | ac035dno | 0.011959 |
| 10 | type_of_emp_Civil servant (public sector) | 0.011984 |
| 11 | type_of_emp_Employee (private sector) | 0.012117 |
| 12 | type_of_emp_Self-employed (including working f... | 0.005328 |
| 13 | full_part_Always full-time | 0.005906 |
| 14 | full_part_Always part-time | 0.004883 |
| 15 | full_part_Changed multiple times | 0.002565 |
| 16 | full_part_Changed once from full-time to part-... | 0.003099 |
| 17 | full_part_Changed once from part-time to full-... | 0.001907 |
| 18 | industry_Agriculture, hunting, forestry, fishing | 0.013861 |
| 19 | industry_Construction | 0.012283 |
| 20 | industry_Education | 0.008587 |
| 21 | industry_Electricity, gas and water supply | 0.004085 |
| 22 | industry_Financial intermediation | 0.004859 |
| 23 | industry_Health and social work | 0.009912 |
| 24 | industry_Hotels and restaurants | 0.009256 |
| 25 | industry_Manufacturing | 0.020988 |
| 26 | industry_Mining and quarrying | 0.005739 |
| 27 | industry_Other community | 0.013880 |
| 28 | industry_Public administration and defence | 0.007714 |
| 29 | industry_Real estate, renting and business act... | 0.001312 |
| 30 | industry_Transport, storage and communication | 0.008238 |
| 31 | industry_Wholesale and retail trade | 0.016771 |
| 32 | Female | 0.012960 |
| 33 | Male | 0.012834 |
| 34 | edu_cat_edu_high | 0.010787 |
| 35 | edu_cat_edu_low | 0.015141 |
| 36 | edu_cat_edu_med | 0.015246 |
| 37 | Income_cat_income_high | 0.012171 |
| 38 | Income_cat_income_low | 0.014739 |
| 39 | Income_cat_income_med | 0.010061 |
| 40 | age_cat_age_50_54 | 0.009196 |
| 41 | age_cat_age_55_59 | 0.013799 |
| 42 | age_cat_age_60_64 | 0.016696 |
| 43 | gali | 0.017112 |
| 44 | sphus | 0.172217 |
| 45 | chronic2 | 0.022169 |